# This script prepares the different imputed and oversampled training datasets to develop the early life CAPE models. 
# These early life training datasets prepared in this script will have had the following optimisation techniques applied: MICE imputation > ADASYN oversampling
# Once the data is prepared, this script needs to be immediately followed by: "Model_development_XXX.txt", where XXX is the name of the different algorithms considered. 
# The data in file "MICE_imputed_standardised_earlylife_training_dataset_1113ID.csv" is found in IOWBC_imputed_data.xlsx, sheet: "Standardised MICE earlylife tr"
# The data in files named "MICE_imputed_oversampled_earlylife_dataset_XXX.csv" were developed using the script "Data_preparation_CAPE_imputation_oversampling.txt" (data can be found in XXX).
# Python version 3.6.8 was used 

# Imports
# These must come before the working directory is set: os.chdir() needs the
# `os` module to be imported already, otherwise the call raises a NameError.
import os

import numpy as np
import pandas as pd

# Set working directory
# All CSV files read below are given as bare file names, so they are resolved
# relative to this directory.
# NOTE(review): "/../../" looks like a placeholder path - confirm it points at
# the folder that holds the input CSV files before running.
os.chdir("/../../")


# Import datasets
# data_0 is the imputed, standardised training set without oversampling; the
# unnamed column written alongside it in the CSV is dropped straight after
# loading.
data_0 = pd.read_csv(
    "MICE_imputed_standardised_earlylife_training_dataset_1113ID.csv",
    index_col=False,
).drop('Unnamed: 0', axis=1)

# The oversampled training sets are read in ascending order of the percentage
# given in their file names, one data frame per file.
(
    data_25,
    data_50,
    data_100,
    data_150,
    data_200,
    data_250,
    data_300,
) = (
    pd.read_csv(csv_name, index_col=False)
    for csv_name in (
        "MICE_imputed_oversampled_earlylife_dataset_25%.csv",
        "MICE_imputed_oversampled_earlylife_dataset_50%.csv",
        "MICE_imputed_oversampled_earlylife_dataset_100%.csv",
        "MICE_imputed_oversampled_earlylife_dataset_150%.csv",
        "MICE_imputed_oversampled_earlylife_dataset_200%.csv",
        "MICE_imputed_oversampled_earlylife_dataset_250%.csv",
        "MICE_imputed_oversampled_earlylife_dataset_300%.csv",
    )
)

# Collect every training dataset considered for model development in `data`.
# data_0 is used as loaded. Each oversampled dataset is cut down to its first
# N rows (N listed next to it below), which removes the extra synthetic cases
# that sit after that point in the file.
data = [data_0] + [
    frame.iloc[:n_rows]
    for frame, n_rows in (
        (data_25, 1155),
        (data_50, 1197),
        (data_100, 1280),
        (data_150, 1364),
        (data_200, 1447),
        (data_250, 1531),
        (data_300, 1614),
    )
]

# One index per training dataset held in `data` (8 datasets, positions 0-7).
# This is used during model development to loop through each training dataset,
# so it must be updated whenever datasets are added to or removed from `data`.
# NOTE: the name shadows Python's built-in `set`; it is left unchanged because
# the model development scripts that follow are expected to refer to it.
set = list(range(8))

# Import the early life test data, standardised against the imputed early life
# training dataset - data found in IOWBC_imputed_data.xlsx,
# sheet: "Standardised earlylife test set".
# The unnamed column written alongside the data in the CSV is dropped on load.
test = pd.read_csv(
    "Early_life_MICE_standardised_test_dataset_255IDs.csv",
    index_col=False,
).drop('Unnamed: 0', axis=1)

# Split test data into outcome (Asthma_10YR) and features (every remaining
# column except the Study_ID identifier).
y_test = test['Asthma_10YR']
X_test = test.drop(['Study_ID', 'Asthma_10YR'], axis=1)


